import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Load the vehicle-silhouette dataset and inspect its structure.
vehicleData = pd.read_csv("vehicle.csv")
vehicleData.head(10)
# Some numeric fields have missing values; count the nulls per column first.
vehicleData.isnull().sum()
vehicleData.shape
vehicleData.info()
# Per-feature medians for the 18 numeric attributes (every column except 'class').
vehicleData.iloc[:, 0:18].median()
# Instead of dropping rows, impute each numeric column's missing values with
# that column's own median. DataFrame.fillna accepts a Series of per-column
# medians, which replaces the original per-column loop (whose
# numeric_only=False / axis=0 arguments are meaningless on a Series median).
vehicleData = vehicleData.fillna(vehicleData.iloc[:, 0:18].median())
vehicleData.head(10)
vehicleData.describe().T
# Boxplot + swarmplot of the first 10 numeric attributes split by vehicle
# class, to compare per-class distributions and spot outliers.
# (The original opened a stray empty plt.figure before plt.subplots; removed.)
f, axes = plt.subplots(10, 1, figsize=(10, 75), sharex=True)
first_batch = [
    ("compactness", "BrBG"),
    ("circularity", "BrBG"),
    ("distance_circularity", "cubehelix"),
    ("radius_ratio", "cubehelix"),
    ("pr.axis_aspect_ratio", "BrBG"),
    ("max.length_aspect_ratio", "BrBG"),
    ("scatter_ratio", "cubehelix"),
    ("elongatedness", "cubehelix"),
    ("pr.axis_rectangularity", "BrBG"),
    ("max.length_rectangularity", "BrBG"),
]
for ax, (feature, palette) in zip(axes, first_batch):
    sns.boxplot(x="class", y=feature, data=vehicleData, palette=palette, ax=ax)
    sns.swarmplot(x="class", y=feature, data=vehicleData, color="0.25", ax=ax)
plt.setp(f.axes, yticks=[])
plt.tight_layout(h_pad=2)
plt.show()  # fixed: plt.show (no parentheses) never rendered the figure
# Boxplot + swarmplot for the remaining attributes, split by class.
# Fixed: the original created only 9 axes but plotted 10 features, so
# 'hollows_ratio' was drawn on 'ax10' — a leftover axis from the PREVIOUS
# figure. Create 10 axes so every feature gets its own subplot here.
# (The stray empty plt.figure before plt.subplots is also removed.)
f, axes = plt.subplots(10, 1, figsize=(10, 75), sharex=True)
second_batch = [
    ("scaled_variance", "BrBG"),
    ("scaled_variance.1", "BrBG"),
    ("distance_circularity", "cubehelix"),
    ("scaled_radius_of_gyration", "cubehelix"),
    ("scaled_radius_of_gyration.1", "BrBG"),
    ("max.length_aspect_ratio", "BrBG"),
    ("skewness_about", "cubehelix"),
    ("skewness_about.1", "cubehelix"),
    ("skewness_about.2", "BrBG"),
    ("hollows_ratio", "BrBG"),
]
for ax, (feature, palette) in zip(axes, second_batch):
    sns.boxplot(x="class", y=feature, data=vehicleData, palette=palette, ax=ax)
    sns.swarmplot(x="class", y=feature, data=vehicleData, color="0.25", ax=ax)
plt.setp(f.axes, yticks=[])
plt.tight_layout(h_pad=2)
plt.show()  # fixed: plt.show (no parentheses) never rendered the figure
#EDA - compare all attributes visually to check for relationships that can be exploited
# Pairwise scatter plots of every attribute with a KDE on the diagonal;
# correlated feature pairs seen here motivate the PCA performed later.
sns.pairplot(vehicleData,diag_kind='kde')
# For example, several feature pairs in the pairplot show strong correlations that PCA can exploit.
# Grid-searched SVM experiment on the raw features.
from sklearn.model_selection import train_test_split
X = vehicleData.drop("class", axis=1)
# Select the target column instead of pop() — pop() mutated vehicleData for
# no benefit since X was already built with drop().
y = vehicleData["class"]
# NOTE(review): test_size=.70 trains on only 30% of the rows; the original
# comment suggests this split was chosen deliberately — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.70, random_state=42)
# Run the support vector classifier, identifying the right set of
# hyper-parameters (kernel, C, gamma) via exhaustive grid search.
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# cache_size is a kernel-cache size in MB — an implementation detail, not a
# hyper-parameter — so it is set once on the estimator, not searched over.
parameters = {'kernel': ('poly', 'rbf'), 'C': (1, 10, 100, 1000), 'gamma': ['scale']}
svc = svm.SVC(cache_size=3000)
print(svc.get_params())
clf = GridSearchCV(svc, parameters, n_jobs=-1)
clf.fit(X_train, y_train)
print("Score for training data::", clf.score(X_train, y_train))
print("Score for testing data::", clf.score(X_test, y_test))
print("Best set of hyper parameters:::", clf.best_params_)
# K-fold cross-validation with 5, 10 and 20 folds — the accuracy scores stay
# consistent across fold counts, indicating a stable model.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# shuffle=True is required whenever random_state is set: modern scikit-learn
# raises ValueError for KFold(random_state=...) with shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=65)
results = cross_val_score(clf, X, y, cv=kfold)
print("CV matrix for fold of 10 \n", results)
print("Mean for fold of 10", results.mean())
kfold = KFold(n_splits=5, shuffle=True, random_state=100)
results = cross_val_score(clf, X, y, cv=kfold)
print("CV matrix for fold of 5 \n", results)
print("Mean for fold of 5", results.mean())  # label fixed: said "fold of 10"
kfold = KFold(n_splits=20, shuffle=True, random_state=8153)
results = cross_val_score(clf, X, y, cv=kfold)
print("CV matrix for fold of 20 \n", results)
print("Mean for fold of 20", results.mean())  # label fixed: said "fold of 10"
# PCA to extract the key components from the standardised features.
from sklearn.decomposition import PCA
from scipy.stats import zscore
# Standardise every feature to zero mean / unit variance so that no single
# attribute dominates the principal components.
x_zcores = X.apply(zscore)
x_zcores.head()
# Covariance matrix of the standardised data (equals the correlation matrix),
# printed for reference only.
cov_matrix = np.cov(x_zcores, rowvar=False)
print(cov_matrix)
# Let 'mle' choose the number of components automatically. PCA must be fitted
# on the observations themselves — the original called pca.fit(cov_matrix),
# which treats the covariance-matrix ROWS as samples and produces meaningless
# explained variances.
pca = PCA(n_components='mle')
pca.fit(x_zcores)
# Explained variance per component.
print(pca.explained_variance_)
# Explained variance ratio per component.
print(pca.explained_variance_ratio_)
# Cumulative explained variance. seaborn >= 0.12 requires keyword x=/y=, and
# the x range must match however many components 'mle' actually kept
# (the original hard-coded range(1, 18)).
n_kept = len(pca.explained_variance_ratio_)
sns.lineplot(x=list(range(1, n_kept + 1)),
             y=np.cumsum(pca.explained_variance_ratio_),
             drawstyle='steps-post')
# Keep 7 components and project the standardised data onto them.
pca3 = PCA(n_components=7)
pca3.fit(x_zcores)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(x_zcores)
sns.pairplot(pd.DataFrame(Xpca3), diag_kind='kde')
# Repeat the grid-searched SVM experiment on the PCA-transformed data so the
# scores can be compared against the raw-data model.
from sklearn.model_selection import train_test_split
# Same 70/30 test/train split and random_state as the raw-data experiment,
# for a like-for-like comparison.
X_train_pca3, X_test_pca3, y_train_pca3, y_test_pca3 = train_test_split(Xpca3, y, test_size=.70, random_state=42)
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# cache_size is a kernel-cache size in MB — an implementation detail, not a
# hyper-parameter — so it is set once on the estimator, not searched over.
parameters = {'kernel': ('poly', 'rbf'), 'C': (1, 10, 100, 1000), 'gamma': ['scale']}
svc = svm.SVC(cache_size=3000)
print(svc.get_params())
clf = GridSearchCV(svc, parameters, n_jobs=-1)
clf.fit(X_train_pca3, y_train_pca3)
print("Score for training data::", clf.score(X_train_pca3, y_train_pca3))
print("Score for testing data::", clf.score(X_test_pca3, y_test_pca3))
print("Best set of hyper parameters:::", clf.best_params_)
# K-fold cross-validation on the PCA-transformed features (5, 10, 20 folds) —
# the accuracy scores stay consistent across fold counts.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# shuffle=True is required whenever random_state is set: modern scikit-learn
# raises ValueError for KFold(random_state=...) with shuffle=False.
kfold = KFold(n_splits=10, shuffle=True, random_state=65)
results = cross_val_score(clf, Xpca3, y, cv=kfold)
print("CV matrix for fold of 10 \n", results)
print("Mean for fold of 10", results.mean())
kfold = KFold(n_splits=5, shuffle=True, random_state=100)
results = cross_val_score(clf, Xpca3, y, cv=kfold)
print("CV matrix for fold of 5 \n", results)
print("Mean for fold of 5", results.mean())
kfold = KFold(n_splits=20, shuffle=True, random_state=8153)
results = cross_val_score(clf, Xpca3, y, cv=kfold)
print("CV matrix for fold of 20 \n", results)
print("Mean for fold of 20", results.mean())
# Final result: SVM was trained (as a predictive model) on the raw data and on the
# PCA-transformed data. The score on raw data is 0.97 versus 0.93 after PCA, so the
# dimensionality reduction causes little loss of information.